Some additional reading on statistical analysis of RNA-seq: http://www.nathalievialaneix.eu/doc/pdf/tutorial-rnaseq.pdf
# pacman is a good package for loading packages
library(pacman)
pacman::p_load(edgeR, RColorBrewer, gplots, ggplot2, reshape2, DT, cowplot,
limma, DESeq, DESeq2, data.table, e1071, ComplexHeatmap)
I have writen several functions in an external R script, we can use those function by sourcing the function
source("https://raw.githubusercontent.com/CJinnny/RNA-seq-tutorial/master/RNA_seq_tutorial_functions.R")
This data came out of Kallisto software, gene counts are the sum of trancript pseudocounts. These counts are not in round numbers, so we need to round them to integers first.
Alternatively, you should be able to generate count data using HTSeq https://htseq.readthedocs.io/en/release_0.10.0/ or featurecounts http://bioinf.wehi.edu.au/featureCounts/. Please do not use bedtools as it usually counts ambiguous alignments.
Here I created my own transcriptome assembly using StringTie https://ccb.jhu.edu/software/stringtie/, hense the “MSTRG” gene_id in the count data.
The reason I did that was because I not only want to study genes found in the reference genome, but also characterize new transcripts.
In order to know gene functions, we can convert MSTRG id into maize reference genome id (v4), which is already well characterized.
require(data.table)
data <- fread("https://raw.githubusercontent.com/CJinnny/RNA-seq-tutorial/master/genes.txt", sep = "\t")
data[,6:15] = round(data[,6:15])
id_conv <- fread("https://raw.githubusercontent.com/CJinnny/RNA-seq-tutorial/master/https://raw.githubusercontent.com/CJinnny/RNA-seq-tutorial/master/all_MSTRG_v4_v3.csv", header = TRUE, sep = ",")
id_conv <- data.frame(id_conv, row.names = 1)
functions <- fread("https://raw.githubusercontent.com/CJinnny/RNA-seq-tutorial/master/B73v4_gene_function.txt", header = FALSE, sep = "\t")
functions <- subset(functions, select = c("V1", "V2"))
colnames(id_conv) <- c("v4_gene_id", "v3_gene_id", "MSTRG_gene_id")
colnames(functions) <- c("v4_gene_id", "gene_function")
annotations = merge(id_conv, functions, by = "v4_gene_id", all.x = TRUE)
data <- merge(data, annotations, by = "MSTRG_gene_id", all.x = TRUE)
data <- aggregate(data,
by = list(data$MSTRG_gene_id),
FUN = aggregate_func
)
data <- na.omit(data, cols="v4_gene_id")
data <- data.frame(data, row.names = 1)
setnames(data, old = c("UU1", "UU2", "UU3", "UU4", "UU5", "WW1", "WW2", "WW3", "WW4", "WW5"),
new = c("U1", "U2","U3","U4","U5", "W1", "W2","W3","W4","W5"))
head(data)
## MSTRG_gene_id chr_name start end strand U1 U2 U3
## MSTRG.1000 MSTRG.1000 chr1 25047512 25050032 - 195 333 154
## MSTRG.10002 MSTRG.10002 chr10 32585811 32588982 - 154 244 273
## MSTRG.10005 MSTRG.10005 chr10 32815479 32817141 + 0 0 0
## MSTRG.10007 MSTRG.10007 chr10 32529623 32567490 - 419 381 397
## MSTRG.1001 MSTRG.1001 chr1 25152267 25155144 + 139 152 150
## MSTRG.10011 MSTRG.10011 chr10 33226451 33367505 - 609 523 514
## U4 U5 W1 W2 W3 W4 W5 v4_gene_id v3_gene_id
## MSTRG.1000 166 105 117 52 126 123 81 Zm00001d028167 GRMZM2G003506
## MSTRG.10002 106 242 226 212 274 218 212 Zm00001d023969 GRMZM2G425986
## MSTRG.10005 0 0 0 0 0 1 0 Zm00001d023973 GRMZM2G097286
## MSTRG.10007 223 395 423 345 406 435 375 Zm00001d023968 GRMZM2G124974
## MSTRG.1001 81 190 167 92 110 182 148 Zm00001d028171 GRMZM2G071374
## MSTRG.10011 274 595 508 320 421 375 399 Zm00001d023974 GRMZM2G027431
## gene_function
## MSTRG.1000 Cytochrome b561 and DOMON domain-containing protein
## MSTRG.10002 A/G-specific adenine DNA glycosylase
## MSTRG.10005 Cysteine proteinases superfamily protein
## MSTRG.10007 Glutathione S-transferase
## MSTRG.1001 Calcium-dependent lipid-binding (CaLB domain) family protein
## MSTRG.10011 Putative endonuclease or glycosyl hydrolase
counts <- data[,6:15]
require(e1071)
data_diagnosis(counts)
## skewness is:
## 36.63689 41.82896 21.72463 34.99034 22.78793 42.0665 33.39914 37.64356 27.01962 54.97658
## kurtosis is:
## 2199.034 3028.689 865.9139 1964.686 867.6066 2758.378 1949.117 2319.712 1375.454 4894.567
Data is skewed (high skewness and high kurtosis).
We need to do data transformations for PCA plot, otherwise outliers will have a great impact on the clustering.
log transformation: logcounts = log2(counts + 1).
DESeq2::rlog(): “regularized log” transformation. For more information see https://rdrr.io/bioc/DESeq2/man/rlog.html
edgeR::cpm(): “counts per million” transformation. For more information see https://rdrr.io/bioc/edgeR/man/cpm.html
DESeq2:varianceStabilizingTransformation(): “variance stabilizing transformation”. For more information see https://rdrr.io/bioc/DESeq2/man/varianceStabilizingTransformation.html
require(DESeq2)
require(edgeR)
logcounts = log2(counts + 1)
rlogcounts = rlog(as.matrix(counts))
rownames(rlogcounts) = rownames(logcounts)
cpmcounts = cpm(as.matrix(counts), prior.count = 2, log = TRUE)
vstcounts = varianceStabilizingTransformation(as.matrix(counts))
data_diagnosis(logcounts)
## skewness is:
## -0.5997057 -0.6048998 -0.6093726 -0.4912253 -0.6109366 -0.5945748 -0.5262748 -0.5730369 -0.5900014 -0.552007
## kurtosis is:
## -0.7826695 -0.7977504 -0.8067969 -0.9002022 -0.7725521 -0.7831297 -0.9279888 -0.8513029 -0.8382193 -0.8962603
data_diagnosis(rlogcounts)
## skewness is:
## -0.6679657 -0.6836715 -0.6832402 -0.6763476 -0.6779365 -0.6773912 -0.6746331 -0.6697096 -0.6842158 -0.671056
## kurtosis is:
## -0.6403723 -0.6370364 -0.6450905 -0.6332838 -0.633821 -0.6331829 -0.6479403 -0.6440739 -0.6427632 -0.6518892
data_diagnosis(cpmcounts)
## skewness is:
## -0.4285089 -0.4730039 -0.4773158 -0.4526601 -0.4639736 -0.4572871 -0.4528065 -0.4374913 -0.4839089 -0.438765
## kurtosis is:
## -0.9539685 -0.9381164 -0.9576858 -0.9321906 -0.9269013 -0.9280453 -0.9964032 -0.9797621 -0.9533996 -1.003599
data_diagnosis(vstcounts)
## skewness is:
## 0.3270751 0.2803435 0.27878 0.3042513 0.3098411 0.308709 0.2888468 0.312236 0.2699977 0.2917919
## kurtosis is:
## -0.5191474 -0.5951802 -0.6988962 -0.5030264 -0.550126 -0.5454259 -0.6343581 -0.5475308 -0.6730192 -0.6279794
require(graphics)
require(RColorBrewer)
par(mfrow=c(2,3), mar=c(5.1, 4.6, 4.1, 1.6))
draw_PCA(counts, title = "PCA on raw data")
draw_PCA(logcounts, title = "PCA on log transformed data")
draw_PCA(rlogcounts, title = "PCA on rlog transformed data")
draw_PCA(cpmcounts, title = "PCA on cpm transformed data")
draw_PCA(vstcounts, title = "PCA on vst transformed data")
par(mfrow=c(2,3))
plotMDS(counts, col = c(rep("red", 5), rep("blue", 5)), cex = 1.5)
title("MDS plot on raw data")
plotMDS(logcounts, col = c(rep("red", 5), rep("blue", 5)), cex = 1.5)
title("MDS plot on log transformed data")
plotMDS(rlogcounts, col = c(rep("red", 5), rep("blue", 5)), cex = 1.5)
title("MDS plot on rlog transformed data")
plotMDS(cpmcounts, col = c(rep("red", 5), rep("blue", 5)), cex = 1.5)
title("MDS plot on cpm transformed data")
plotMDS(vstcounts, col = c(rep("red", 5), rep("blue", 5)), cex = 1.5)
title("MDS plot on vst transformed data")
require(gplots)
draw_corr_heatmap(as.matrix(counts), show_cellnote = TRUE, title = "clustering sample-to-sample\n distance on raw data")
draw_corr_heatmap(as.matrix(logcounts), show_cellnote = TRUE, title = "clustering sample-to-sample\n distance on log transformed data")
draw_corr_heatmap(rlogcounts, show_cellnote = TRUE, title = "clustering sample-to-sample\n distance on rlog transformed data")
draw_corr_heatmap(cpmcounts, show_cellnote = TRUE, title = "clustering sample-to-sample\n distance on cpm transformed data")
draw_corr_heatmap(vstcounts, show_cellnote = TRUE, title = "clustering sample-to-sample\n distance on vst transformed data")
Here I have to relevel the conditions because by default it considers the first factor to be the reference
conditions = factor(c(rep("Ufo",5), rep("Wt",5)))
conditions = relevel(conditions, ref="Wt")
I’m using cpm normalization method, filter out non-expressed genes (0 counts), you can also try more stringent filtering (e.g. cpm > 1 in at least 5 samples)
keep <- rowSums(cpm(counts)>0) >= 5
table(keep)
## keep
## FALSE TRUE
## 3584 23106
keep_true = data.frame(keep[which(keep == TRUE)])
filter = subset(counts, rownames(counts) %in% rownames(keep_true))
require(DESeq)
cds = newCountDataSet(filter, conditions)
cds = estimateSizeFactors(cds)
cds = estimateDispersions(cds)
DESeq_res = nbinomTest(cds, "Wt", "Ufo")
rownames(DESeq_res) = DESeq_res$id
DESeq_DE = subset(DESeq_res, (log2FoldChange < -1 & padj < 0.05) | (log2FoldChange > 1 & padj < 0.05) )
DESeq_nc = counts(cds, normalized = TRUE)
DESeq_nc = data.frame("id" = rownames(DESeq_nc), DESeq_nc)
DESeq_nc_DE = subset(DESeq_nc, id %in% DESeq_DE$id)
require(DESeq2)
colData = data.frame(samples = colnames(filter), conditions = conditions)
dds = DESeqDataSetFromMatrix(countData = filter, colData = colData, design = ~conditions)
## converting counts to integer mode
dds = DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
DESeq2_res = results(dds)
DESeq2_res = data.frame("id" = rownames(DESeq2_res), DESeq2_res)
DESeq2_DE = subset(DESeq2_res, (log2FoldChange < -1 & padj < 0.05) | (log2FoldChange > 1 & padj < 0.05) )
DESeq2_nc = counts(dds, normalized = TRUE)
DESeq2_nc = data.frame("id" = rownames(DESeq2_nc), DESeq2_nc)
DESeq2_nc_DE = subset(DESeq2_nc, id %in% DESeq2_DE$id)
require(edgeR)
group = as.vector(conditions)
dge = DGEList(counts = filter, group = group)
dge = calcNormFactors(dge)
dis = estimateCommonDisp(dge)
tag = estimateTagwiseDisp(dis)
etx = exactTest(tag, pair = c("Wt", "Ufo"))
edgeR_res = etx$table
edgeR_res$FDR = p.adjust(edgeR_res$PValue, method = "BH")
edgeR_res = data.frame("id" = rownames(edgeR_res), edgeR_res)
edgeR_DE = subset(edgeR_res, (logFC < -1 & FDR < 0.05) | (logFC > 1 & FDR < 0.05) )
edgeR_nc = tag$pseudo.counts
edgeR_nc = data.frame("id"=rownames(edgeR_nc), edgeR_nc)
edgeR_nc_DE = subset(edgeR_nc, id %in% edgeR_DE$id)
require(limma)
design = model.matrix(~conditions)
voom = voom(filter, design, normalize="quantile")
fit = lmFit(voom, design)
fit = eBayes(fit)
limma_res = topTable(fit, coef = NULL, n = Inf)
## Removing intercept from test coefficients
limma_res = data.frame("id" = rownames(limma_res), limma_res)
limma_DE = subset(limma_res, (logFC < -1 & adj.P.Val < 0.05) | (logFC > 1 & adj.P.Val < 0.05) )
limma_nc = 2**voom$E
limma_nc = data.frame("id" = rownames(limma_nc), limma_nc)
limma_nc_DE = subset(limma_nc, id %in% limma_DE$id)
dflist <- list(DESeq=DESeq_DE, DESeq2=DESeq2_DE, edgeR=edgeR_DE, limma=limma_DE)
Compare <- join_id(dflist)
Compare$message
## [1] "There are 282 genes DE in all 4 methods 114 genes in 3, 245 genes in 2, 136 genes in 1."
require(DT)
summary <- merge(Compare$merged_table, annotations, by.x = "id", by.y = "MSTRG_gene_id", all.x = TRUE)
datatable(summary,
rownames = FALSE,
caption = htmltools::tags$caption(
style = "caption-side: bottom; text-align: left;",
"Table 1: Summary of DEGs found in four different methods."),
options = list(
searchHighlight = TRUE
)
)
summary <- merge(summary, DESeq_res[,c("id", "log2FoldChange")], by = "id")
setnames(summary, old = "log2FoldChange", new = "DESeq_logFC")
summary <- merge(summary, DESeq2_res[,c("id", "log2FoldChange")], by = "id")
setnames(summary, old = "log2FoldChange", new = "DESeq2_logFC")
summary <- merge(summary, edgeR_res[,c("id", "logFC")], by = "id")
setnames(summary, old = "logFC", new = "edgeR_logFC")
summary <- merge(summary, limma_res[,c("id", "logFC")], by = "id")
setnames(summary, old = "logFC", new = "limma_logFC")
summary = summary[,c(1,9,10,11,12,6,7,8)]
summary = aggregate(summary, by = list(summary$id), FUN = aggregate_func)
summary = summary[,c(1,3,4,5,6,7,8,9)]
setnames(summary, old = "Group.1", new = "id")
datatable(summary, filter = "top",
rownames = FALSE,
caption = htmltools::tags$caption(
style = "caption-side: bottom; text-align: left;",
"Table 2: Summary of logFC values of DEGs."),
options = list(
autoWidth = TRUE,
searchHighlight = TRUE,
scrollX = TRUE
)
)
require(VennDiagram)
draw_venndiagram(dflist, Compare$merged_table)
Cluster for Up & Down DEGs, respectively
require(ComplexHeatmap)
Ht1 = DE_heatmap(DESeq_nc_DE, title = "DESeq", km = 2)
Ht2 = DE_heatmap(DESeq2_nc_DE, title = "DESeq2", km = 2)
Ht3 = DE_heatmap(edgeR_nc_DE, title = "edgeR", km = 2)
Ht4 = DE_heatmap(limma_nc_DE, title = "limma", km = 2)
Ht1
Ht2
Ht3
Ht4
Ht1 = DE_heatmap(DESeq_nc_DE, common_id=Compare$common_id, title = "DESeq", km = 2)
Ht2 = DE_heatmap(DESeq2_nc_DE, common_id=Compare$common_id, title = "DESeq2", km = 2)
Ht3 = DE_heatmap(edgeR_nc_DE, common_id=Compare$common_id, title = "edgeR", km = 2)
Ht4 = DE_heatmap(limma_nc_DE, common_id=Compare$common_id, title = "limma", km = 2)
Ht1 + Ht2 + Ht3 + Ht4
par(mfrow=c(2,2), mar=c(5.1, 4.6, 4.1, 1.6))
draw_MA(DESeq_res, type="DESeq")
draw_MA(DESeq2_res, type="DESeq2")
draw_MA(edgeR_res, type="edgeR")
draw_MA(limma_res, type="limma")
par(mfrow=c(2,2), mar=c(5.1, 4.6, 4.1, 1.6))
draw_volcano(DESeq_res, type="DESeq")
draw_volcano(DESeq2_res, type="DESeq2")
draw_volcano(edgeR_res, type="edgeR")
draw_volcano(limma_res, type="limma")
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libLAPACK.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats4 parallel stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] VennDiagram_1.6.20 futile.logger_1.4.3
## [3] plyr_1.8.4 ComplexHeatmap_1.20.0
## [5] e1071_1.7-0 data.table_1.11.8
## [7] DESeq2_1.22.1 SummarizedExperiment_1.12.0
## [9] DelayedArray_0.8.0 BiocParallel_1.16.0
## [11] matrixStats_0.54.0 GenomicRanges_1.34.0
## [13] GenomeInfoDb_1.18.1 IRanges_2.16.0
## [15] S4Vectors_0.20.1 DESeq_1.34.0
## [17] lattice_0.20-38 locfit_1.5-9.1
## [19] Biobase_2.42.0 BiocGenerics_0.28.0
## [21] cowplot_0.9.3 DT_0.5
## [23] reshape2_1.4.3 ggplot2_3.1.0
## [25] gplots_3.0.1 RColorBrewer_1.1-2
## [27] pacman_0.5.0 edgeR_3.24.0
## [29] limma_3.38.2
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.3-2 rjson_0.2.20 class_7.3-14
## [4] rprojroot_1.3-2 circlize_0.4.5 htmlTable_1.12
## [7] XVector_0.22.0 GlobalOptions_0.1.0 base64enc_0.1-3
## [10] rstudioapi_0.8 bit64_0.9-7 AnnotationDbi_1.44.0
## [13] splines_3.5.1 geneplotter_1.60.0 knitr_1.20
## [16] Formula_1.2-3 jsonlite_1.5 annotate_1.60.0
## [19] cluster_2.0.7-1 shiny_1.2.0 BiocManager_1.30.4
## [22] compiler_3.5.1 backports_1.1.2 assertthat_0.2.0
## [25] Matrix_1.2-15 lazyeval_0.2.1 formatR_1.5
## [28] later_0.7.5 acepack_1.4.1 htmltools_0.3.6
## [31] tools_3.5.1 bindrcpp_0.2.2 gtable_0.2.0
## [34] glue_1.3.0 GenomeInfoDbData_1.2.0 dplyr_0.7.8
## [37] Rcpp_1.0.0 gdata_2.18.0 crosstalk_1.0.0
## [40] stringr_1.3.1 mime_0.6 gtools_3.8.1
## [43] XML_3.98-1.16 zlibbioc_1.28.0 scales_1.0.0
## [46] promises_1.0.1 lambda.r_1.2.3 yaml_2.2.0
## [49] curl_3.2 memoise_1.1.0 gridExtra_2.3
## [52] rpart_4.1-13 latticeExtra_0.6-28 stringi_1.2.5
## [55] RSQLite_2.1.1 genefilter_1.64.0 checkmate_1.8.5
## [58] caTools_1.17.1.1 shape_1.4.4 rlang_0.3.0.1
## [61] pkgconfig_2.0.2 bitops_1.0-6 evaluate_0.12
## [64] purrr_0.2.5 bindr_0.1.1 htmlwidgets_1.3
## [67] labeling_0.3 bit_1.1-14 tidyselect_0.2.5
## [70] magrittr_1.5 R6_2.3.0 Hmisc_4.1-1
## [73] DBI_1.0.0 pillar_1.3.0 foreign_0.8-71
## [76] withr_2.1.2 survival_2.43-1 RCurl_1.95-4.11
## [79] nnet_7.3-12 tibble_1.4.2 crayon_1.3.4
## [82] futile.options_1.0.1 KernSmooth_2.23-15 rmarkdown_1.10
## [85] GetoptLong_0.1.7 blob_1.1.1 digest_0.6.18
## [88] xtable_1.8-3 httpuv_1.4.5 munsell_0.5.0